{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# A demo of K-Means clustering on the handwritten digits data\n\n\nIn this example we compare the various initialization strategies for\nK-means in terms of runtime and quality of the results.\n\nAs the ground truth is known here, we also apply different cluster\nquality metrics to judge the goodness of fit of the cluster labels to the\nground truth.\n\nCluster quality metrics evaluated (see `clustering_evaluation` for\ndefinitions and discussions of the metrics):\n\n=========== ========================================================\nShorthand    full name\n=========== ========================================================\nhomo         homogeneity score\ncompl        completeness score\nv-meas       V measure\nARI          adjusted Rand index\nAMI          adjusted mutual information\nsilhouette   silhouette coefficient\n=========== ========================================================\n\n\n"
      ]
    },
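    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "The next cell is a small illustrative sketch, not part of the original example: it evaluates the same cluster quality metrics on a tiny hand-made labelling so their behaviour is easy to inspect before running the full benchmark. The names `true_labels`, `pred_labels` and `toy_points` are made up for this illustration."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# A minimal sketch (illustrative only): the metrics benchmarked below,\n# computed on hand-made labels. `true_labels`, `pred_labels` and\n# `toy_points` are made-up toy inputs, not part of the digits example.\nimport numpy as np\nfrom sklearn import metrics\n\ntrue_labels = np.array([0, 0, 0, 1, 1, 1])\npred_labels = np.array([0, 0, 1, 1, 2, 2])\n\nprint(\"homo      \", metrics.homogeneity_score(true_labels, pred_labels))\nprint(\"compl     \", metrics.completeness_score(true_labels, pred_labels))\nprint(\"v-meas    \", metrics.v_measure_score(true_labels, pred_labels))\nprint(\"ARI       \", metrics.adjusted_rand_score(true_labels, pred_labels))\nprint(\"AMI       \", metrics.adjusted_mutual_info_score(true_labels, pred_labels))\n\n# The silhouette coefficient is computed from the data itself, not from\n# ground-truth labels, so it also needs the sample coordinates.\ntoy_points = np.array([[0.0, 0.0], [0.1, 0.0], [0.0, 0.1],\n                       [5.0, 5.0], [5.1, 5.0], [5.0, 5.1]])\nprint(\"silhouette\", metrics.silhouette_score(toy_points, pred_labels,\n                                             metric='euclidean'))"
      ]
    },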
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(__doc__)\n\nfrom time import time\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import metrics\nfrom sklearn.cluster import KMeans\nfrom sklearn.datasets import load_digits\nfrom sklearn.decomposition import PCA\nfrom sklearn.preprocessing import scale\n\nnp.random.seed(42)\n\nX_digits, y_digits = load_digits(return_X_y=True)\ndata = scale(X_digits)\n\nn_samples, n_features = data.shape\nn_digits = len(np.unique(y_digits))\nlabels = y_digits\n\nsample_size = 300\n\nprint(\"n_digits: %d, \\t n_samples %d, \\t n_features %d\"\n      % (n_digits, n_samples, n_features))\n\n\nprint(82 * '_')\nprint('init\\t\\ttime\\tinertia\\thomo\\tcompl\\tv-meas\\tARI\\tAMI\\tsilhouette')\n\n\ndef bench_k_means(estimator, name, data):\n    t0 = time()\n    estimator.fit(data)\n    print('%-9s\\t%.2fs\\t%i\\t%.3f\\t%.3f\\t%.3f\\t%.3f\\t%.3f\\t%.3f'\n          % (name, (time() - t0), estimator.inertia_,\n             metrics.homogeneity_score(labels, estimator.labels_),\n             metrics.completeness_score(labels, estimator.labels_),\n             metrics.v_measure_score(labels, estimator.labels_),\n             metrics.adjusted_rand_score(labels, estimator.labels_),\n             metrics.adjusted_mutual_info_score(labels,  estimator.labels_),\n             metrics.silhouette_score(data, estimator.labels_,\n                                      metric='euclidean',\n                                      sample_size=sample_size)))\n\nbench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),\n              name=\"k-means++\", data=data)\n\nbench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),\n              name=\"random\", data=data)\n\n# in this case the seeding of the centers is deterministic, hence we run the\n# kmeans algorithm only once with n_init=1\npca = PCA(n_components=n_digits).fit(data)\nbench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),\n              name=\"PCA-based\",\n              data=data)\nprint(82 * '_')\n\n# #############################################################################\n# Visualize the results on PCA-reduced data\n\nreduced_data = PCA(n_components=2).fit_transform(data)\nkmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)\nkmeans.fit(reduced_data)\n\n# Step size of the mesh. Decrease to increase the quality of the VQ.\nh = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].\n\n# Plot the decision boundary. For that, we will assign a color to each\nx_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1\ny_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1\nxx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n\n# Obtain labels for each point in mesh. 
Use last trained model.\nZ = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])\n\n# Put the result into a color plot\nZ = Z.reshape(xx.shape)\nplt.figure(1)\nplt.clf()\nplt.imshow(Z, interpolation='nearest',\n           extent=(xx.min(), xx.max(), yy.min(), yy.max()),\n           cmap=plt.cm.Paired,\n           aspect='auto', origin='lower')\n\nplt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)\n# Plot the centroids as a white X\ncentroids = kmeans.cluster_centers_\nplt.scatter(centroids[:, 0], centroids[:, 1],\n            marker='x', s=169, linewidths=3,\n            color='w', zorder=10)\nplt.title('K-means clustering on the digits dataset (PCA-reduced data)\\n'\n          'Centroids are marked with white cross')\nplt.xlim(x_min, x_max)\nplt.ylim(y_min, y_max)\nplt.xticks(())\nplt.yticks(())\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.9"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}